confusion <- function(yhat, y, quietly = FALSE){
if(!quietly) message("yhat is the vector of predicted outcomes, possibly a factor.
\n Sensitivity = (first level predicted) / (first level actual)
\n Specificity = (second level predicted) / (second level actual)")
if(!is.factor(y) & is.factor(yhat))
y <- as.factor(y)
if(!all.equal(levels(yhat), levels(y)))
stop("Factor levels of yhat and y do not match.")
confusion_mat <- table(yhat, y, deparse.level = 2)
stats <- data.frame(sensitivity = confusion_mat[1, 1]/sum(confusion_mat[, 1]), specificity
= confusion_mat[2, 2]/sum(confusion_mat[, 2]))
return(list(confusion_mat = confusion_mat, stats = stats))
}
fifadataraw <- read_csv("data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_double(),
## ID = col_double(),
## Age = col_double(),
## Overall = col_double(),
## Potential = col_double(),
## Special = col_double(),
## `International Reputation` = col_double(),
## `Weak Foot` = col_double(),
## `Skill Moves` = col_double(),
## `Jersey Number` = col_double(),
## Crossing = col_double(),
## Finishing = col_double(),
## HeadingAccuracy = col_double(),
## ShortPassing = col_double(),
## Volleys = col_double(),
## Dribbling = col_double(),
## Curve = col_double(),
## FKAccuracy = col_double(),
## LongPassing = col_double(),
## BallControl = col_double()
## # ... with 24 more columns
## )
## See spec(...) for full column specifications.
fifadataraw1 <- fifadataraw
fifadataraw1 <- fifadataraw1 %>% mutate(League = ifelse(Club == "FC Bayern München" | Club == "Borussia Dortmund" | Club == "RB Leipzig" | Club == "Bayer 04 Leverkusen" | Club == "Borussia Mönchengladbach" | Club == "VfL Wolfsburg" | Club == "Eintracht Frankfurt" | Club == "SV Werd Bremen" | Club == "TSG 1899 Hoffenheim" | Club == "Fortuna Düsseldorf" | Club == "Hertha BSC" | Club == "1. FSV Mainz 05" | Club == "SC Freiburg" | Club == "FC Schalke 04" | Club == "FC Augsburg" | Club == "VfB Stuttgart" | Club == "Hannover 96" | Club == "FC Nürnberg", "Bundesliga", ifelse(Club == "FC Barcelona" | Club == "Real Madrid" | Club == "Atlético Madrid" | Club == "Valencia CF" | Club == "Getafe CF" | Club == "Sevilla FC" | Club == "RCD Espanyol" | Club == "Athletic Club de Bilbao" | Club == "Real Sociedad" | Club == "Real Betis" | Club == "Deportivo Alavés" | Club == "SD Eibar" | Club == "CD Leganés" | Club == "Villarreal CF" | Club == "Levante UD" | Club == "Real Valladolid CF" | Club == "RC Celta" | Club == "Girona FC" | Club == "SD Huesca" | Club == "Rayo Vallecano", "La Liga", ifelse(Club == "Manchester City" | Club == "Liverpool" | Club == "Chelsea" | Club == "Tottenham Hotspur"| Club == "Arsenal" | Club == "Manchester United" | Club == "Wolverhampton Wanderers" | Club == "Everton" | Club == "Leicester City" | Club == "West Ham United" | Club == "Watford" | Club == "Crystal Palace" | Club == "Newcastle United" | Club == "Bournemouth" | Club == "Burnley" | Club == "Southampton" | Club == "Brighton & Hove Albion" | Club == "Cardiff City" | Club == "Fulham" | Club == "Huddersfield Town", "Premier League", ifelse(Club == "Juventus" | Club == "Napoli" | Club == "Atalanta" | Club == "Inter" | Club == "Milan" | Club == "Roma" | Club == "Torino" | Club == "Lazio" | Club == "Sampdoria" | Club == "Bologna" | Club == "Sassuolo" | Club == "Udinese" | Club == "SPAL" | Club == "Parma" | Club == "Cagliari" | Club == "Fiorentina" | Club == "Genoa" | Club == "Empoli" | Club == "Frosinone" | Club == "Chievo Verona", "Serie A", ifelse(Club == "Paris Saint-Germain" | Club == "LOSC Lille" | Club == "Olympique Lyonnais" | Club == "AS Saint-Étienne" | Club == "Olympique de Marseille" | Club == "Montpellier HSC" | Club == "OGC Nice" | Club == "Stade de Reims" | Club == "Nîmes Olympique" | Club == "Stade Rennais FC" | Club == "RC Strasbourg Alsace" | Club == "FC Nantes" | Club == "Angers SCO" | Club == "FC Girondins de Bordeaux" | Club == "Amiens SC" | Club == "Toulouse Football Club" | Club == "AS Monaco" | Club == "Dijon FCO" | Club == "Stade Malherbe Caen" | Club == "En Avant de Guingamp", "Ligue 1", "NA"))))))
fifa_data2 <- fifadataraw1 %>% dplyr::select(-c(ID, Flag, Photo, 11, 29:54)) %>%
mutate(ValueMultiplier = ifelse(str_detect(Value, "K"), 1000, ifelse(str_detect(Value, "M"), 1000000, 1))) %>%
mutate(Value = as.numeric(str_extract(Value, "[[:digit:]]+\\.*[[:digit:]]*")) * ValueMultiplier) %>%
mutate(Position = ifelse(is.na(Position), "Unknown", Position))
fifa_data2 <- fifa_data2 %>%
mutate(WageMultiplier = ifelse(str_detect(Wage, "K"), 1000, ifelse(str_detect(Wage, "M"), 1000000, 1))) %>%
mutate(Wage = as.numeric(str_extract(Wage, "[[:digit:]]+\\.*[[:digit:]]*")) * WageMultiplier)
temp1 <- sapply(fifa_data2$Weight, parse_number)
fifa_data2$Weight <- as.numeric(temp1)
temp2 <- strsplit(fifa_data2$Height, "'")
for (i in 1:length(temp2)) {
temp2[[i]] <- as.numeric(temp2[[i]])
}
for (i in 1:length(temp2)) {
temp2[[i]] <- (temp2[[i]][1] *12) + temp2[[i]][2]
}
temp3 <- as.numeric(unlist(temp2))
fifa_data2$Height <- temp3
colleague <- c(4, 10:22, 59, 61, 62)
fifa_data3 <- fifa_data2[,-colleague]
fifa_data4 <- fifa_data3[!is.na(fifa_data3$League),]
set.seed(1234)
cv1 <- fold(fifa_data4, k = 5, cat_col = 'League', id_col = 'X1')
cv1 <- cv1 %>%
rename(fold = .folds)
testleague <- subset(cv1, fold == 5)
trainleague <- anti_join(cv1, testleague)
## Joining, by = c("X1", "Name", "Age", "Overall", "Potential", "Club", "Value", "Wage", "Height", "Weight", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes", "League", "fold")
fifa_data <- fifadataraw %>% dplyr::select(-c(ID, Flag, Photo, 11, 29:54)) %>%
mutate(ValueMultiplier = ifelse(str_detect(Value, "K"), 1000, ifelse(str_detect(Value, "M"), 1000000, 1))) %>%
mutate(Value = as.numeric(str_extract(Value, "[[:digit:]]+\\.*[[:digit:]]*")) * ValueMultiplier) %>%
mutate(Position = ifelse(is.na(Position), "Unknown", Position))
fifa_data <- fifa_data %>%
mutate(WageMultiplier = ifelse(str_detect(Wage, "K"), 1000, ifelse(str_detect(Wage, "M"), 1000000, 1))) %>%
mutate(Wage = as.numeric(str_extract(Wage, "[[:digit:]]+\\.*[[:digit:]]*")) * WageMultiplier)
off <- c('ST', 'CF', 'LF', 'LS', 'LW', 'RF', 'RS', 'RW', 'CAM', 'LAM', 'RAM')
def <- c('CB', 'LB', 'LCB', 'LWB', 'RB', 'RCB', 'RWB', 'LDM', 'CDM', 'RDM')
mid <- c('CM', 'LCM', 'RCM', 'RM', 'LM')
gk <- c('GK')
fifa_data <- fifa_data %>% mutate(pgroup = ifelse(Position %in% gk, "GK", ifelse(Position %in% off, "OFF", ifelse(Position %in% mid, "MID", ifelse(Position %in% def, "DEF", "NA")))))
##AGE
colpca <- c(1, 3, 5, 6, 25:58, 62)
fifa_data_pca <- na.omit(fifa_data[colpca])
pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Age, -pgroup), scale = TRUE)
screeplot(pca_fifa)
pca_scores <- pca_fifa$x
ldr3 <- pca_scores %>%
data.frame() %>%
mutate(Age = fifa_data_pca$Age, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
dplyr::select(Age, pgroup, everything())
##VALUE
colpca <- c(1, 8, 5, 6, 25:58, 62)
fifa_data_pca <- na.omit(fifa_data[colpca])
pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Value, -pgroup), scale = TRUE)
screeplot(pca_fifa)

pca_scores <- pca_fifa$x
ldr8 <- pca_scores %>%
data.frame() %>%
mutate(Value = fifa_data_pca$Value, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
dplyr::select(Value, pgroup, everything())
##WAGE
colpca <- c(1, 9, 5, 6, 25:58, 62)
fifa_data_pca <- na.omit(fifa_data[colpca])
pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Wage, -pgroup), scale = TRUE)
screeplot(pca_fifa)
pca_scores <- pca_fifa$x
ldr9 <- pca_scores %>%
data.frame() %>%
mutate(Wage = fifa_data_pca$Wage, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
dplyr::select(Wage,pgroup, everything())
#SPECIAL
colpca <- c(1, 10, 5, 6, 25:58,62)
fifa_data_pca <- na.omit(fifa_data[colpca])
pca_fifa <- prcomp(fifa_data_pca %>% dplyr::select(-X1, -Special, -pgroup), scale = TRUE)
screeplot(pca_fifa)
pca_scores <- pca_fifa$x
ldr10 <- pca_scores %>%
data.frame() %>%
mutate(Special = fifa_data_pca$Special, X1 = fifa_data_pca$X1, pgroup = fifa_data_pca$pgroup) %>%
dplyr::select(Special, pgroup, everything())
ldr3.1 <- ldr3[-which(ldr3$pgroup == "NA"),]
colors <- c("#999999", "#E69F00", "#56B4E9", "#FF0000")
colors <- colors[as.factor(ldr3.1$pgroup)]
s3d <- scatterplot3d(ldr3.1[,c(3,4,5)], pch = 1, color=colors)

#legend("left", legend = levels(ldr3.1$pgroup), col = colors, pch = 16)
colors <- c("#999999", "#E69F00", "#56B4E9", "#000000", "#FF0000")
colors <- colors[as.factor(ldr8$pgroup)]
s4d <- scatterplot3d(ldr8[,c(3,4,1)], pch = 1, color=colors)
legend("left", legend = levels(as.factor(ldr8$pgroup)), col = colors, pch = 16)

colors <- c("#999999", "#E69F00", "#56B4E9", "#000000", "#FF0000")
colors <- colors[as.factor(ldr9$pgroup)]
scatterplot3d(ldr9[,c(3,4,1)], pch = 1, color=colors)
legend("left", legend = levels(as.factor(ldr9$pgroup)),
col = colors, pch = 16)

colors <- c("#999999", "#E69F00", "#56B4E9", "#000000", "#FF0000")
colors <- colors[as.factor(ldr10$pgroup)]
scatterplot3d(ldr10[,c(3,4,1)], pch = 1, color=colors)
legend("left", legend = levels(as.factor(ldr10$pgroup)),
col = colors, pch = 16)

ggplot(ldr3.1, aes(x = PC1, y = PC2, color = ldr3.1$pgroup)) +geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +geom_text(aes(label = X1), size = 2) +scale_x_continuous(breaks = -10:10) +coord_cartesian(xlim = c(-10, 15)) +theme_light()

fifa_data <- fifa_data %>% mutate(AgeGroup = ifelse(Age <= 22, "young", ifelse(Age > 22 & Age <= 27, "middle", ifelse(Age >= 28, "old", "NA"))))
fifa_data_pagegroup <- transform(fifa_data, pagegroup = paste(pgroup,AgeGroup))
head(fifa_data_pagegroup)
## X1 Name Age Nationality Overall Potential
## 1 0 L. Messi 31 Argentina 94 94
## 2 1 Cristiano Ronaldo 33 Portugal 94 94
## 3 2 Neymar Jr 26 Brazil 92 93
## 4 3 De Gea 27 Spain 91 93
## 5 4 K. De Bruyne 27 Belgium 91 92
## 6 5 E. Hazard 27 Belgium 91 91
## Club Value Wage Special Preferred.Foot
## 1 FC Barcelona 110500000 565000 2202 Left
## 2 Juventus 77000000 405000 2228 Right
## 3 Paris Saint-Germain 118500000 290000 2143 Right
## 4 Manchester United 72000000 260000 1471 Right
## 5 Manchester City 102000000 355000 2281 Right
## 6 Chelsea 93000000 340000 2142 Right
## International.Reputation Weak.Foot Skill.Moves Work.Rate Body.Type
## 1 5 4 4 Medium/ Medium Messi
## 2 5 4 5 High/ Low C. Ronaldo
## 3 5 5 5 High/ Medium Neymar
## 4 4 3 1 Medium/ Medium Lean
## 5 4 5 4 High/ High Normal
## 6 4 4 4 High/ Medium Normal
## Real.Face Position Jersey.Number Joined Loaned.From
## 1 Yes RF 10 Jul 1, 2004 <NA>
## 2 Yes ST 7 Jul 10, 2018 <NA>
## 3 Yes LW 10 Aug 3, 2017 <NA>
## 4 Yes GK 1 Jul 1, 2011 <NA>
## 5 Yes RCM 7 Aug 30, 2015 <NA>
## 6 Yes LF 10 Jul 1, 2012 <NA>
## Contract.Valid.Until Height Weight Crossing Finishing HeadingAccuracy
## 1 2021 5'7 159lbs 84 95 70
## 2 2022 6'2 183lbs 84 94 89
## 3 2022 5'9 150lbs 79 87 62
## 4 2020 6'4 168lbs 17 13 21
## 5 2023 5'11 154lbs 93 82 55
## 6 2020 5'8 163lbs 81 84 61
## ShortPassing Volleys Dribbling Curve FKAccuracy LongPassing BallControl
## 1 90 86 97 93 94 87 96
## 2 81 87 88 81 76 77 94
## 3 84 84 96 88 87 78 95
## 4 50 13 18 21 19 51 42
## 5 92 82 86 85 83 91 91
## 6 89 80 95 83 79 83 94
## Acceleration SprintSpeed Agility Reactions Balance ShotPower Jumping
## 1 91 86 91 95 95 85 68
## 2 89 91 87 96 70 95 95
## 3 94 90 96 94 84 80 61
## 4 57 58 60 90 43 31 67
## 5 78 76 79 91 77 91 63
## 6 94 88 95 90 94 82 56
## Stamina Strength LongShots Aggression Interceptions Positioning Vision
## 1 72 59 94 48 22 94 94
## 2 88 79 93 63 29 95 82
## 3 81 49 82 56 36 89 87
## 4 43 64 12 38 30 12 68
## 5 90 75 91 76 61 87 94
## 6 83 66 80 54 41 87 89
## Penalties Composure Marking StandingTackle SlidingTackle GKDiving
## 1 75 96 33 28 26 6
## 2 85 95 28 31 23 7
## 3 81 94 27 24 33 9
## 4 40 68 15 21 13 90
## 5 79 88 68 58 51 15
## 6 86 91 34 27 22 11
## GKHandling GKKicking GKPositioning GKReflexes Release.Clause
## 1 11 15 14 8 €226.5M
## 2 11 15 14 11 €127.1M
## 3 9 15 15 11 €228.1M
## 4 85 87 88 94 €138.6M
## 5 13 5 10 13 €196.4M
## 6 12 6 8 8 €172.1M
## ValueMultiplier WageMultiplier pgroup AgeGroup pagegroup
## 1 1e+06 1000 OFF old OFF old
## 2 1e+06 1000 OFF old OFF old
## 3 1e+06 1000 OFF middle OFF middle
## 4 1e+06 1000 GK middle GK middle
## 5 1e+06 1000 MID middle MID middle
## 6 1e+06 1000 OFF middle OFF middle
extract <- function(x){
regexp <- "[[:digit:]]+"
str_extract(x, regexp)
}
temp1 <- sapply(fifa_data_pagegroup$Weight, extract)
fifa_data_pagegroup$Weight <- as.numeric(temp1)
temp2 <- strsplit(fifa_data_pagegroup$Height, "'")
for (i in 1:length(temp2)) {
temp2[[i]] <- as.numeric(temp2[[i]])
}
for (i in 1:length(temp2)) {
temp2[[i]] <- (temp2[[i]][1] *12) + temp2[[i]][2]
}
temp3 <- as.numeric(unlist(temp2))
fifa_data_pagegroup$Height <- temp3
colors2 <- c("palegreen1", "lightskyblue1", "honeydew2", "cyan3", "orchid", "grey55", "papayawhip", "mediumturquoise", "darkslategrey", "salmon2", "gold1", "tomato3")
colpca <- c(1, 8, 5, 6, 25:58, 64)
fifa_data_pca2 <- na.omit(fifa_data_pagegroup[colpca])
pca_fifa2 <- prcomp(fifa_data_pca2 %>% dplyr::select(-X1, -Value, -pagegroup), scale = TRUE)
screeplot(pca_fifa2)

pca_scores2 <- pca_fifa2$x
ldr8_2 <- pca_scores2 %>%
data.frame() %>%
mutate(Value = fifa_data_pca2$Value, X1 = fifa_data_pca2$X1, pagegroup = fifa_data_pca2$pagegroup) %>%
dplyr::select(Value, pagegroup, everything())
colors2 <- colors2[as.factor(ldr8_2$pagegroup)]
s4d <- scatterplot3d(ldr8[,c(3,4,1)], pch = 1, color=colors2)
legend("right", legend = levels(as.factor(ldr8_2$pagegroup)),
col = colors2, pch = 16)

ggplot(ldr8_2, aes(x = PC1, y = PC2, color = ldr8_2$pagegroup)) +geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +geom_text(aes(label = X1), size = 2) +scale_x_continuous(breaks = -10:10) +coord_cartesian(xlim = c(-10, 15)) +theme_light()

fifa_data_pca2 <- fifa_data_pca2[-c(1,2)]
ldahist1 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "OFF young"),]
ldahist2 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "OFF middle"),]
ldahist3 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "OFF old"),]
ldahist4 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "DEF young"),]
ldahist5 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "DEF middle"),]
ldahist6 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "DEF old"),]
ldahist7 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "GK young"),]
ldahist8 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "GK middle"),]
ldahist9 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "GK old"),]
ldahist10 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "MID young"),]
ldahist11<- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "MID middle"),]
ldahist12 <- fifa_data_pca2[which(fifa_data_pca2$pagegroup == "MID old"),]
ldahist1 %>% gather()
ldahist2 %>% gather()
ldahist3 %>% gather()
ldahist4 %>% gather()
ldahist5 %>% gather()
ldahist6 %>% gather()
ldahist7 %>% gather()
ldahist8 %>% gather()
ldahist9 %>% gather()
ldahist10 %>% gather()
ldahist11 %>% gather()
ldahist12 %>% gather()
ggplot(gather(ldahist1),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist2),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist3),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist4),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist5),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist6),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist7),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist8),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist9),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist10),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist11),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
ggplot(gather(ldahist12),aes(value)) + geom_bar(bins = 100) + facet_wrap(~key, nrow = 6, ncol = 7, scales = 'free')
colpca1 <- c(1, 3, 5, 6, 8, 9, 18, 23:58, 64)
fifa_data_pagegroup <- fifa_data_pagegroup[-which(fifa_data_pagegroup$pagegroup == 'NA young'),]
fifa_data_pagegroup <- fifa_data_pagegroup[-which(fifa_data_pagegroup$pagegroup == 'NA middle'),]
fifa_data_pagegroup <- fifa_data_pagegroup[-which(fifa_data_pagegroup$pagegroup == 'NA old'),]
fifa_data_pagegroup <- na.omit(fifa_data_pagegroup[colpca1])
fifa_data_pagegroup1 <- fifa_data_pagegroup
set.seed(1234)
cv <- fold(fifa_data_pagegroup1, k = 5, cat_col = 'pagegroup', id_col = 'X1')
cv <- cv %>%
rename(fold = .folds)
test <- subset(cv, fold == 5)
train <- anti_join(cv, test)
## Joining, by = c("X1", "Age", "Overall", "Potential", "Value", "Wage", "Position", "Height", "Weight", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes", "pagegroup", "fold")
head(fifa_data_pagegroup1)
## X1 Age Overall Potential Value Wage Position Height Weight
## 1 0 31 94 94 110500000 565000 RF 67 159
## 2 1 33 94 94 77000000 405000 ST 74 183
## 3 2 26 92 93 118500000 290000 LW 69 150
## 4 3 27 91 93 72000000 260000 GK 76 168
## 5 4 27 91 92 102000000 355000 RCM 71 154
## 6 5 27 91 91 93000000 340000 LF 68 163
## Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve
## 1 84 95 70 90 86 97 93
## 2 84 94 89 81 87 88 81
## 3 79 87 62 84 84 96 88
## 4 17 13 21 50 13 18 21
## 5 93 82 55 92 82 86 85
## 6 81 84 61 89 80 95 83
## FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility
## 1 94 87 96 91 86 91
## 2 76 77 94 89 91 87
## 3 87 78 95 94 90 96
## 4 19 51 42 57 58 60
## 5 83 91 91 78 76 79
## 6 79 83 94 94 88 95
## Reactions Balance ShotPower Jumping Stamina Strength LongShots
## 1 95 95 85 68 72 59 94
## 2 96 70 95 95 88 79 93
## 3 94 84 80 61 81 49 82
## 4 90 43 31 67 43 64 12
## 5 91 77 91 63 90 75 91
## 6 90 94 82 56 83 66 80
## Aggression Interceptions Positioning Vision Penalties Composure Marking
## 1 48 22 94 94 75 96 33
## 2 63 29 95 82 85 95 28
## 3 56 36 89 87 81 94 27
## 4 38 30 12 68 40 68 15
## 5 76 61 87 94 79 88 68
## 6 54 41 87 89 86 91 34
## StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning
## 1 28 26 6 11 15 14
## 2 31 23 7 11 15 14
## 3 24 33 9 9 15 15
## 4 21 13 90 85 87 88
## 5 58 51 15 13 5 10
## 6 27 22 11 12 6 8
## GKReflexes pagegroup
## 1 8 OFF old
## 2 11 OFF old
## 3 11 OFF middle
## 4 94 GK middle
## 5 13 MID middle
## 6 8 OFF middle
ldacol <- c(1, 3:6, 8:44)
trainlda <- train[ldacol]
testlda <- test[ldacol]
testlda1<- as.matrix(testlda)
mlda <- lda(pagegroup ~ ., data = trainlda[,-1])
## Warning in lda.default(x, grouping, ...): groups NA middle NA old NA young
## are empty
lda.pred <- predict(mlda, newdata = testlda[,-1])
lda.pred1 <- lda.pred[[1]]
lda.pred1 <- as.matrix(lda.pred1)
confusion(as.vector(lda.pred1), as.vector(testlda1[,42]), quietly = FALSE)
## yhat is the vector of predicted outcomes, possibly a factor.
##
## Sensitivity = (first level predicted) / (first level actual)
##
## Specificity = (second level predicted) / (second level actual)
## $confusion_mat
## y
## yhat DEF middle DEF old DEF young GK middle GK old GK young
## DEF middle 373 72 57 0 0 0
## DEF old 98 365 0 0 0 0
## DEF young 30 0 332 0 0 0
## GK middle 0 0 0 64 24 20
## GK old 0 0 0 37 126 0
## GK young 0 0 0 25 2 109
## MID middle 22 10 5 0 0 0
## MID old 13 49 1 0 0 0
## MID young 2 0 31 0 0 0
## OFF middle 1 0 0 0 0 0
## OFF old 0 0 0 0 0 0
## OFF young 0 0 1 0 0 0
## y
## yhat MID middle MID old MID young OFF middle OFF old OFF young
## DEF middle 40 5 10 3 0 0
## DEF old 14 47 0 0 0 0
## DEF young 5 0 28 0 0 3
## GK middle 0 0 0 0 0 0
## GK old 0 0 0 0 0 0
## GK young 0 0 0 0 0 0
## MID middle 141 25 17 62 16 8
## MID old 29 104 0 12 36 0
## MID young 20 0 205 9 0 67
## OFF middle 67 14 10 159 52 31
## OFF old 10 22 0 54 141 1
## OFF young 5 0 63 13 0 217
##
## $stats
## sensitivity specificity
## 1 0.6920223 0.7358871
fifa_data_pagegroup2 <- transform(fifa_data, pagegroup = paste(pgroup,AgeGroup))
extract <- function(x){
regexp <- "[[:digit:]]+"
str_extract(x, regexp)
}
temp1 <- sapply(fifa_data_pagegroup2$Weight, extract)
fifa_data_pagegroup2$Weight <- as.numeric(temp1)
temp2 <- strsplit(fifa_data_pagegroup2$Height, "'")
for (i in 1:length(temp2)) {
temp2[[i]] <- as.numeric(temp2[[i]])
}
for (i in 1:length(temp2)) {
temp2[[i]] <- (temp2[[i]][1] *12) + temp2[[i]][2]
}
temp3 <- as.numeric(unlist(temp2))
fifa_data_pagegroup2$Height <- temp3
colpca1 <- c(1, 2, 5, 6, 8, 9, 18, 23:58, 64)
fifa_data_pagegroup2 <- fifa_data_pagegroup2[-which(fifa_data_pagegroup2$pagegroup == 'NA young'),]
fifa_data_pagegroup2 <- fifa_data_pagegroup2[-which(fifa_data_pagegroup2$pagegroup == 'NA middle'),]
fifa_data_pagegroup2 <- fifa_data_pagegroup2[-which(fifa_data_pagegroup2$pagegroup == 'NA old'),]
fifa_data_pagegroup2 <- na.omit(fifa_data_pagegroup2[colpca1])
fifa_data_pagegroup3 <- fifa_data_pagegroup2
fifa_data_pagegroup3$pagegroup <- droplevels(fifa_data_pagegroup3$pagegroup)
set.seed(1234)
cv <- fold(fifa_data_pagegroup3, k = 5, cat_col = 'pagegroup', id_col = 'X1')
cv <- cv %>%
rename(fold = .folds)
test1 <- subset(cv, fold == 5)
train1 <- anti_join(cv, test1)
## Joining, by = c("X1", "Name", "Overall", "Potential", "Value", "Wage", "Position", "Height", "Weight", "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl", "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance", "ShotPower", "Jumping", "Stamina", "Strength", "LongShots", "Aggression", "Interceptions", "Positioning", "Vision", "Penalties", "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving", "GKHandling", "GKKicking", "GKPositioning", "GKReflexes", "pagegroup", "fold")
ldacol1 <- c(1, 3:6, 8:45)
trainlda2 <- train1[ldacol1]
testlda2 <- test1[ldacol1]
set.seed(1234)
knn_models <- list()
ktrain <- trainlda[,-c(1,42)]
ktest <- testlda[,-c(1,42)]
for(i in 1:25) {
knn_models[[i]] <- knn(ktrain, ktest, cl = trainlda$pagegroup, k = i)
}
knn_results <- lapply(knn_models, FUN = function(x) {
return(confusion(x, testlda$pagegroup, quietly = TRUE)$stats)
}
)
knn_results <- bind_rows(knn_results)
knn_results$K <- 1:25
ggplot(knn_results, aes(x = specificity, y = sensitivity, label = K)) + geom_point() + geom_text(hjust = 2)

set.seed(1)
rf.fifa <- randomForest(pagegroup~., data = trainlda2[,-c(1,43)], mtry = 7, importance = TRUE, ntree = 500)
yhat.bag <- predict(rf.fifa, newdata = testlda2[,-c(1,43)])
confusion(yhat.bag, testlda2$pagegroup)
## yhat is the vector of predicted outcomes, possibly a factor.
##
## Sensitivity = (first level predicted) / (first level actual)
##
## Specificity = (second level predicted) / (second level actual)
## $confusion_mat
## y
## yhat DEF middle DEF old DEF young GK middle GK old GK young
## DEF middle 368 60 68 0 0 0
## DEF old 100 409 0 0 0 0
## DEF young 49 1 324 0 0 0
## GK middle 0 0 0 77 18 12
## GK old 0 0 0 28 133 1
## GK young 0 0 0 21 1 116
## MID middle 15 7 4 0 0 0
## MID old 2 16 0 0 0 0
## MID young 4 1 30 0 0 0
## OFF middle 1 1 0 0 0 0
## OFF old 0 1 0 0 0 0
## OFF young 0 0 1 0 0 0
## y
## yhat MID middle MID old MID young OFF middle OFF old OFF young
## DEF middle 45 5 16 3 0 0
## DEF old 21 70 0 0 0 0
## DEF young 3 0 30 0 0 3
## GK middle 0 0 0 0 0 0
## GK old 0 0 0 0 0 0
## GK young 0 0 0 0 0 0
## MID middle 156 39 23 62 19 7
## MID old 10 62 0 7 25 1
## MID young 36 3 205 13 0 61
## OFF middle 37 10 7 165 67 32
## OFF old 10 26 0 35 133 0
## OFF young 13 2 52 27 1 223
##
## $stats
## sensitivity specificity
## 1 0.6827458 0.8245968
varImpPlot(rf.fifa)

league <- read_csv("league - league.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Name = col_character(),
## Club = col_character(),
## League = col_character()
## )
## See spec(...) for full column specifications.
league <- na.omit(league)
league3 <- league %>% mutate(perf = ifelse(Whoscored <= 6, "poor", ifelse(Whoscored >= 6.01 & Whoscored <= 6.50, "below", ifelse(Whoscored >= 6.51 & Whoscored <= 7.00, "average", ifelse(Whoscored >= 7.01 & Whoscored <= 7.5, "above", "excellent")))))
colpcal <- c(2:3, 8:10, 47)
league2 <- league[,-colpcal]
league3 <- league3[,-colpcal]
pca_league <- prcomp(league3 %>% dplyr::select(-X1, -Whoscored, -perf), scale = TRUE)
screeplot(pca_league)

pca_scoresleague <- pca_league$x
ldrleague <- pca_scoresleague %>%
data.frame() %>%
mutate(X1 = league2$X1, perf = league3$perf) %>%
dplyr::select(X1, perf, everything())
colors3 <- c("blue", "red", "green", "yellow", "purple")
colors3 <- colors3[as.factor(ldrleague$perf)]
s6d <- scatterplot3d(ldrleague[,c(3,4,5)], pch = 1, color=colors3)

#legend("right", legend = levels(as.factor(ldrleague$perf)), col = colors3, pch = 16)
ggplot(ldrleague, aes(x = PC1, y = PC2, color = ldrleague$perf)) +geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +geom_text(aes(label = X1), size = 2) +scale_x_continuous(breaks = -10:10) +coord_cartesian(xlim = c(-15, 7)) +theme_light()

set.seed(1234)
league3 <- subset(league3, select = -c(1,42))
train_size <- floor(0.65 * nrow (league3))
subset <- sample(seq_len(nrow(league3)), size = train_size)
train_league <- league3[subset,]
test_league <- league3[-subset,]
lm.mod <- lm(Whoscored~., data = train_league)
MSPE = mean((test_league$Whoscored - predict.lm(lm.mod, test_league)) ^ 2)
MSPE
## [1] 0.08237169
train_y <- predict(lm.mod, test_league)
plot(lm.mod)




#Ridge
ytr <- league3$Whoscored
xtr <- model.matrix(Whoscored~., league3)[,-1]
cvRidge.out <- cv.glmnet(xtr,ytr,alpha=0,nfolds=5)
plot(cvRidge.out)

mse.minridge <- min(cvRidge.out$cvm)
mse.minridge
## [1] 0.07795221
cat("CV Errors", cvRidge.out$cvm,fill=TRUE)
## CV Errors 0.111561 0.1110418 0.1109166 0.1108502 0.1107778 0.1106989
## 0.110613 0.1105194 0.1104176 0.1103069 0.1101868 0.1100563 0.109915
## 0.1097618 0.1095961 0.109417 0.1092237 0.1090153 0.1087909 0.1085497
## 0.1082907 0.1080132 0.1077161 0.1073987 0.1070602 0.1066997 0.1063167
## 0.1059103 0.1054801 0.1050255 0.1045461 0.1040418 0.1035122 0.1029574
## 0.1023775 0.1017729 0.1011442 0.1004919 0.09981642 0.09912052 0.09840517
## 0.09767211 0.09692344 0.09616156 0.09538914 0.09460906 0.09382443
## 0.09303854 0.0922548 0.09147671 0.0907078 0.08995154 0.08921132 0.08849029
## 0.08779134 0.087117 0.08646944 0.0858504 0.08526122 0.08470293 0.08417668
## 0.08368156 0.08321718 0.08278326 0.08237933 0.0820042 0.08165664
## 0.08133527 0.0810386 0.08076506 0.0805131 0.08028139 0.08006769 0.07987125
## 0.07969052 0.07952426 0.07937127 0.07923059 0.07910063 0.07898119
## 0.07887102 0.07877011 0.07867682 0.07859132 0.078513 0.07844158 0.07837598
## 0.07831609 0.07826204 0.07821293 0.07816863 0.07812932 0.078094 0.07806312
## 0.07803554 0.07801188 0.07799149 0.07797433 0.07795997 0.07795221
cat("Lambda with smallest CV Error",
cvRidge.out$lambda[which.min(cvRidge.out$cvm)],fill=TRUE)
## Lambda with smallest CV Error 0.01714158
cat("Coefficients", as.numeric(coef(cvRidge.out, s = "lambda.min")),fill=TRUE)
## Coefficients 3.549959 -0.001595308 0.01891555 0.009841154 0.007227757
## 0.0003798079 0.0005687167 -0.0002035215 0.0004656983 -0.0008915139
## -0.0001443171 -6.473863e-05 0.0003366044 0.00134886 -9.463043e-05
## 0.0008776562 0.001776189 -0.0008266312 -0.001059408 0.003611604
## -6.285275e-05 -0.001681204 -0.0007804197 0.003115542 0.0004154781
## -0.0006916806 -0.0007262519 0.00104486 0.0003364664 -0.001030838
## -0.0002385867 -0.0005185592 -5.907338e-05 3.870985e-05 0.000269259
## 0.0006258105 0.0004933196 0.001502573 -0.0007679195 -0.00119968
cat("Number of Zero Coefficients",
sum(abs(coef(cvRidge.out))<1e-8),fill=TRUE)
## Number of Zero Coefficients 0
#Lasso
cvLasso.out <- cv.glmnet(xtr,ytr,alpha=1,nfolds=5)
plot(cvLasso.out)

mse.minlasso <- min(cvLasso.out$cvm)
mse.minlasso
## [1] 0.07817806
cat("CV Errors", cvLasso.out$cvm,fill=TRUE)
## CV Errors 0.1113083 0.1067624 0.102601 0.09914565 0.09627655 0.09389419
## 0.09191595 0.09026808 0.08884736 0.08761073 0.08653092 0.08559207
## 0.08479483 0.08412937 0.08350497 0.08284906 0.08226885 0.08175441
## 0.0813173 0.08094293 0.08063297 0.08038348 0.0801821 0.080014 0.07985879
## 0.0797227 0.07959319 0.07946063 0.07933221 0.07919785 0.07905756 0.0789187
## 0.07880565 0.07871796 0.07864972 0.07859317 0.07854842 0.07851653
## 0.07848239 0.07843392 0.07839096 0.07835601 0.07832325 0.07829488
## 0.07827621 0.07826406 0.07825922 0.07824993 0.07823377 0.07823087
## 0.07822476 0.07821548 0.07819967 0.07818264 0.07817806 0.07818283
## 0.07819662 0.0782145 0.0782365 0.07826205 0.07828146 0.07830038 0.07831751
## 0.07834069 0.07836349 0.07838468 0.07840511 0.07842496 0.0784397
## 0.07845981 0.07847184 0.07848991 0.07850071 0.07851724 0.0785262
## 0.07853711 0.07855195 0.0785594 0.07856619 0.07857464 0.07858211
## 0.07858922 0.07859556 0.07860544 0.07861099 0.078615 0.07861823 0.07862092
cat("Lambda with smallest CV Error",
cvLasso.out$lambda[which.min(cvLasso.out$cvm)],fill=TRUE)
## Lambda with smallest CV Error 0.001127801
cat("Number of Zero Coefficients",sum(abs(coef(cvLasso.out))<1e-8),
fill=TRUE)
## Number of Zero Coefficients 35
coef(cvLasso.out)
## 40 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 4.7214454440
## Age .
## Overall 0.0197585707
## Potential 0.0051389722
## Height .
## Weight .
## Crossing .
## Finishing .
## HeadingAccuracy .
## ShortPassing .
## Volleys .
## Dribbling .
## Curve .
## FKAccuracy .
## LongPassing .
## BallControl .
## Acceleration .
## SprintSpeed .
## Agility .
## Reactions .
## Balance .
## ShotPower .
## Jumping .
## Stamina 0.0001865443
## Strength .
## LongShots .
## Aggression .
## Interceptions 0.0002555665
## Positioning .
## Vision .
## Penalties .
## Composure .
## Marking .
## StandingTackle .
## SlidingTackle .
## GKDiving .
## GKHandling .
## GKKicking .
## GKPositioning .
## GKReflexes .
pcrout <- pcr(Whoscored~., data = train_league, scale=TRUE, validation="CV")
x_train = model.matrix(Whoscored~., train_league)[,-1]
x_test = model.matrix(Whoscored~., test_league)[,-1]
y_train = train_league[,1]
y_test = as.matrix(test_league[,1])
pcr_pred = predict(pcrout, x_test, ncomp=6)
pcr_pred1 = as.matrix(pcr_pred)
mean((pcr_pred1-y_test)^2)
## [1] 0.08442751
validationplot(pcrout,val.type="MSEP")

plsout <- plsr(Whoscored~., data = train_league, scale=TRUE, validation="CV")
x_train = model.matrix(Whoscored~., train_league)[,-1]
x_test = model.matrix(Whoscored~., test_league)[,-1]
y_train = train_league[,1]
y_test = as.matrix(test_league[,1])
pls_pred = predict(plsout, x_test, ncomp=6)
pls_pred1 = as.matrix(pls_pred)
mean((pls_pred1-y_test)^2)
## [1] 0.08240741
validationplot(plsout, val.type="MSEP")

#Elastic Net
set.seed(1234)
cv_5 = trainControl(method = "cv", number = 5)
cvElastic.out <- train(Whoscored~., data = league3, method = "glmnet",trControl = cv_5)
cvElastic.out
## glmnet
##
## 2283 samples
## 39 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1826, 1827, 1825, 1828, 1826
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 0.0003428315 0.2813807 0.2908060 0.2176521
## 0.10 0.0034283153 0.2807828 0.2932995 0.2173039
## 0.10 0.0342831525 0.2805064 0.2952396 0.2176827
## 0.55 0.0003428315 0.2812248 0.2914896 0.2175642
## 0.55 0.0034283153 0.2803356 0.2952431 0.2172293
## 0.55 0.0342831525 0.2830744 0.2871470 0.2204476
## 1.00 0.0003428315 0.2810755 0.2921273 0.2174833
## 1.00 0.0034283153 0.2800792 0.2965043 0.2171685
## 1.00 0.0342831525 0.2856996 0.2805554 0.2229247
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda
## = 0.003428315.
plot(cvElastic.out)
